import os
from pandas import DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.decomposition import PCA
from scipy.stats import zscore
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
def preparation(data_path, target_path):
    """Load the feature table and the target table from CSV files.

    The feature table is indexed by its 'GO term', 'category' and
    'description' columns, and its rows are ordered by ascending row total.

    Args:
        data_path: Path of the feature CSV; it must contain the three index
            columns named above.
        target_path: Path of the target CSV, which is read without changes.

    Returns:
        Tuple ``(df_raw, df_raw_target)``: the sorted feature frame and the
        target frame.
    """
    index_columns = ['GO term', 'category', 'description']
    features = pd.read_csv(data_path, index_col=False).set_index(index_columns)
    targets = pd.read_csv(target_path)

    # Temporary sort key: the row total divided by the number of rows. The
    # divisor is the same for every row, so it does not influence the order.
    sort_key = features.sum(axis=1) / len(features.index)
    ordered = (
        features.assign(total=sort_key)
        .sort_values('total', ascending=True)
        .drop(columns=['total'])
    )
    return ordered, targets
def normalizing(df_raw, df_raw_target):
    """Build MinMax-scaled and z-scored variants of the feature table.

    Args:
        df_raw: Feature frame; both scalings are applied column by column.
        df_raw_target: Target frame; its ``Sample`` column labels the columns
            of the MinMax-scaled frame and the frame itself is returned as
            the target.

    Returns:
        Tuple ``(data_list, feature, y)`` where ``data_list`` is
        ``[raw, minmax, zscore]`` as DataFrames, ``feature`` holds the
        transposed values of those three frames in the same order, and ``y``
        is ``df_raw_target``.
    """
    scaler = preprocessing.MinMaxScaler()
    # NOTE(review): the column labels come from the target table rather than
    # from df_raw.columns; this assumes both list the samples in the same
    # order -- TODO confirm.
    minmax_frame = pd.DataFrame(
        scaler.fit_transform(df_raw),
        columns=df_raw_target.Sample,
        index=df_raw.index,
    )
    zscore_frame = df_raw.apply(zscore)

    frames = [df_raw, minmax_frame, zscore_frame]
    # Transposed value arrays, one per frame, in the same order as `frames`.
    matrices = [frame.T.values for frame in frames]
    return frames, matrices, df_raw_target
def calculate_braycurtis(data):
    """Return the square Bray-Curtis distance matrix between columns of ``data``.

    Args:
        data: 2-D array or DataFrame; it is transposed first, so distances
            are computed between its columns.

    Returns:
        Square ndarray with one row and one column per column of ``data``.
    """
    condensed = pdist(data.T, metric='braycurtis')
    return squareform(condensed)
def calculate_PCA(data, y, n_components=3):
    """Project ``data`` onto its leading principal components.

    Args:
        data: 2-D array-like passed to ``PCA.fit_transform``.
        y: Target data; it is wrapped in a DataFrame and joined column-wise
            to the component scores (``pd.concat`` aligns on index labels).
        n_components: Value forwarded to ``PCA``; defaults to 3, the number
            of components this function always used before.

    Returns:
        DataFrame whose first columns are named 'principal component 1',
        'principal component 2', ... followed by the columns of ``y``.
    """
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(data)
    # Name the columns after the width actually returned, so the labels stay
    # correct for any n_components value.
    component_names = ['principal component %d' % (k + 1)
                       for k in range(principalComponents.shape[1])]
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=component_names)
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)
    return finalDf
def visualize_PCA_2D(finalDf):
    """Scatter the first two principal components, one colour per study.

    Args:
        finalDf: DataFrame with the columns 'principal component 1',
            'principal component 2' and 'Study'.

    Returns:
        The result of ``plt.show()``.
    """
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    targets = finalDf.Study.unique()
    # 'orange' stands in for white, which is hard to see on a light
    # background.
    colors = ['b', 'orange', 'r', 'c', 'm', 'y', 'k', 'g']
    for position, target in enumerate(targets):
        indicesToKeep = finalDf['Study'] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
                   finalDf.loc[indicesToKeep, 'principal component 2'],
                   # Cycle through the palette so that studies beyond the
                   # eighth are still drawn instead of being dropped.
                   c=colors[position % len(colors)],
                   s=50,
                   alpha=0.5,
                   label=target)
    # Labels are attached to each scatter above, so every legend entry is
    # tied to the points it describes.
    ax.legend()
    ax.grid()
    return plt.show()
def plot_heatmap(df, title):
    """Draw ``df`` as a seaborn heatmap on a new 10 x 18 figure.

    Args:
        df: Frame to plot; its index needs at least three levels because the
            level at position 2 supplies the row tick labels.
        title: Text shown above the heatmap.
    """
    plt.figure(figsize=(10, 18))
    sns.set(font_scale=0.7)
    plt.title(title, fontsize=20)

    row_labels = df.index.get_level_values(2)
    colorbar_options = {"orientation": "horizontal"}
    sns.heatmap(df, yticklabels=row_labels, cbar_kws=colorbar_options)
# Locations of the feature table and the target table.
data_path, target_path = 'output/df_raw.csv', 'output/df_raw_target.csv'

df_raw, df_raw_target = preparation(data_path, target_path)
data_list, feature, y = normalizing(df_raw, df_raw_target)

# Summary statistics of the unscaled data. The result is not stored, so it is
# only visible when an interactive session echoes the expression.
data_list[0].T.describe()

# One histogram figure per data variant, titled with the variant's name.
label = ['raw', 'maxmin', 'zscore']
for variant_name, frame in zip(label, data_list):
    select = frame.T
    select.plot.hist(bins=50, alpha=0.5, legend=False, title=variant_name)
# Pair every data variant with the title of its heatmap.
heatmap_titles = ["Not Processed", "MinMax Scaled", "ZScore Scaled"]
dataframe = [[frame, heading] for frame, heading in zip(data_list, heatmap_titles)]
for frame, heading in dataframe:
    plot_heatmap(frame, heading)

# 2-D PCA scatter for each feature matrix (raw, MinMax, z-score).
for matrix in feature:
    finalDf = calculate_PCA(matrix, y)
    visualize_PCA_2D(finalDf)
# Bray-Curtis distance matrix for each data variant (raw, MinMax, z-score).
df_braycurtis = [calculate_braycurtis(frame) for frame in data_list]

for distances in df_braycurtis:
    plt.figure(figsize=(5, 5))
    sns.set(font_scale=0.7)
    # Mask the diagonal and the upper triangle: the square matrix is
    # symmetric, so only the lower triangle carries information.
    # The builtin `bool` is used because the `np.bool` alias was removed
    # from NumPy.
    mask = np.triu(np.ones_like(distances, dtype=bool))
    sns.heatmap(distances,
                mask=mask,
                cbar_kws={"orientation": "horizontal"})
# PCA of the z-scored feature matrix (third entry of `feature`).
finalDf = calculate_PCA(feature[2], y)

# Set style of scatterplot
sns.set_context("notebook", font_scale=1.1)
sns.set_style("whitegrid")

# Scatter only (no regression line), coloured by study.
scatter_options = {"s": 150, "alpha": 0.7}
g = sns.lmplot(
    data=finalDf,
    x="principal component 1",
    y="principal component 2",
    hue='Study',
    palette="Set2",
    height=10,
    fit_reg=False,
    legend=True,
    scatter_kws=scatter_options,
)

title_text = plt.title('PCA Results', weight='bold')
title_text.set_fontsize('14')
x_label_text = plt.xlabel('Principal Component 1', weight='bold')
x_label_text.set_fontsize('10')
y_label_text = plt.ylabel('Principal Component 2', weight='bold')
y_label_text.set_fontsize('10')
from sklearn.manifold import TSNE
from time import time
def _plot_tsne_sweep(X, y, perplexities, label, n_components=2, **tsne_kwargs):
    """Fit t-SNE on ``X`` once per perplexity and plot every embedding.

    Args:
        X: Input handed to ``TSNE.fit_transform``.
        y: Target table; it is joined column-wise to each embedding and must
            provide the 'Study' column used for colouring.
        perplexities: Perplexity values to try, in order.
        label: Name of the input data, printed at the start of each timing
            line.
        n_components: Embedding dimensionality; the plot shows the first two
            components.
        **tsne_kwargs: Extra keyword arguments forwarded to ``TSNE``.

    Returns:
        List with one embedding array per perplexity, in input order.
    """
    component_names = ['component_%d' % (k + 1) for k in range(n_components)]
    embeddings = []
    for perplexity in perplexities:
        t0 = time()
        # Fixed random_state keeps the random initialisation reproducible.
        tsne = TSNE(n_components=n_components, init='random',
                    random_state=0, perplexity=perplexity, **tsne_kwargs)
        embedding = tsne.fit_transform(X)
        embeddings.append(embedding)
        t1 = time()
        print("%s, perplexity=%d in %.2g sec" % (label, perplexity, t1 - t0))

        tSNE_df = pd.DataFrame(data=embedding, columns=component_names)
        tSNE_df = pd.concat([tSNE_df, pd.DataFrame(y)], axis=1)

        # Set style of scatterplot
        sns.set_context("notebook", font_scale=1.1)
        sns.set_style("whitegrid")
        sns.lmplot(x="component_1",
                   y="component_2",
                   data=tSNE_df,
                   legend=True,
                   palette="Set2",
                   fit_reg=False,
                   height=10,
                   hue='Study',
                   scatter_kws={"s": 150, "alpha": 0.7})
        plt.title('tSNE Result, perplexity= %d' % (perplexity), weight='bold').set_fontsize('14')
        plt.xlabel('Component 1', weight='bold').set_fontsize('10')
        plt.ylabel('Component 2', weight='bold').set_fontsize('10')
        plt.show()
    return embeddings


# Sweep 1: t-SNE directly on the MinMax-scaled feature matrix.
perplexities = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
n_components = 2
# feature[1] is the MinMax-scaled matrix, so the timing lines are labelled
# with the data that is actually embedded.
X = feature[1]
tsne_result = _plot_tsne_sweep(X, y, perplexities, "minmax",
                               n_components=n_components)

# Sweep 2: t-SNE on the precomputed Bray-Curtis matrix of the z-scored data.
# NOTE(review): recent scikit-learn releases require perplexity < n_samples;
# confirm the number of samples exceeds 100 or trim this list.
perplexities = [2, 5, 30, 50, 100]
n_components = 2
# NOTE(review): Bray-Curtis is normally applied to non-negative values, while
# the z-scored table contains negative entries -- confirm this matrix is the
# intended input.
X = df_braycurtis[2]
tsne_result = _plot_tsne_sweep(X, y, perplexities, "z-score",
                               n_components=n_components,
                               metric="precomputed")